from botasaurus import *
from botasaurus.browser_decorator import browser
from botasaurus_driver.user_agent import UserAgent
import csv
import os
import time
import random
from urllib.parse import urljoin


# --- Helper functions ---

def update_master_status(cities_file, city_name, new_status):
    """Persist a city's new status into the master CSV.

    Reads every row of *cities_file*, patches the ``status`` field of the
    row whose ``city`` equals *city_name*, and rewrites the whole file.
    This is how the "done" state survives across runs.
    """
    try:
        with open(cities_file, mode='r', encoding='utf-8', newline='') as file:
            all_rows = list(csv.DictReader(file))

        # Preserve whatever columns the master file already has; fall back
        # to the expected schema only when the file had no data rows.
        header = all_rows[0].keys() if all_rows else ["city", "url", "status"]

        matched = False
        for record in all_rows:
            if record.get('city') == city_name:
                record['status'] = new_status
                matched = True
                break

        if not matched:
            print(f"警告: 尝试更新主控文件状态时，未找到城市 '{city_name}'。")
            return

        with open(cities_file, mode='w', encoding='utf-8', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=header)
            writer.writeheader()
            writer.writerows(all_rows)

        print(f"主控文件状态更新: 城市 '{city_name}' 的状态已设置为 '{new_status}'。")

    except FileNotFoundError:
        print(f"严重错误: 无法找到主控文件 '{cities_file}' 来更新状态。")
    except Exception as e:
        print(f"严重错误: 更新主控文件 '{cities_file}' 时发生异常: {e}")


def load_or_initialize_pending_tasks(pending_file, cities_file):
    """Build the authoritative to-do list from the master city file.

    *cities_file* is the single source of truth: only rows whose status is
    '0' become tasks.  *pending_file* is merely a progress cache — when it
    holds a saved ``next_page_url`` for a city, that URL replaces the
    city's base URL so scraping resumes mid-pagination.  The rebuilt list
    is written back to *pending_file* and returned (empty list on error).
    """
    # Step 1: load any saved progress into a city -> next_page_url map.
    saved_progress = {}
    if os.path.exists(pending_file) and os.path.getsize(pending_file) > 0:
        try:
            with open(pending_file, mode='r', encoding='utf-8', newline='') as file:
                for cached in csv.DictReader(file):
                    if 'city' in cached and 'next_page_url' in cached:
                        saved_progress[cached['city']] = cached['next_page_url']
            if saved_progress:
                print(f"成功从 '{pending_file}' 加载了 {len(saved_progress)} 个任务的现有进度。")
        except Exception as e:
            print(f"警告: 读取 '{pending_file}' 进度时出错，将忽略已有进度: {e}")

    # Step 2: walk the master file and keep only status-'0' cities.
    tasks = []
    try:
        with open(cities_file, mode='r', encoding='utf-8', newline='') as file:
            for entry in csv.DictReader(file):
                if entry.get('status') != '0':
                    continue

                name = entry.get('city')
                start_url = entry.get('url')
                if not name or not start_url:
                    print(f"警告: 在 '{cities_file}' 中发现不完整的行，已跳过。行数据: {entry}")
                    continue

                tasks.append({
                    "city": name,
                    "base_url": start_url,
                    # Resume from cached progress when available.
                    "next_page_url": saved_progress.get(name, start_url),
                })

        # Step 3: report, persist the rebuilt queue, and hand it back.
        if tasks:
            print(f"同步完成。根据主控文件，共有 {len(tasks)} 个待处理任务。")
        else:
            print("同步完成。主控文件中没有发现任何待处理的任务 (status='0')。")

        update_pending_file(pending_file, tasks)

        return tasks

    except FileNotFoundError:
        print(f"严重错误: 主控文件 '{cities_file}' 不存在。无法继续。")
        return []
    except Exception as e:
        print(f"处理主控文件时发生未知错误: {e}")
        return []


def update_pending_file(filepath, tasks_list):
    """Write the current task queue to *filepath* (the progress cache).

    An empty queue means everything is finished, so the cache file is
    deleted instead of being rewritten.
    """
    if not tasks_list:
        if os.path.exists(filepath):
            os.remove(filepath)
            print("所有任务完成，已删除待办任务文件。")
        return

    try:
        columns = ["city", "base_url", "next_page_url"]
        with open(filepath, mode='w', encoding='utf-8', newline='') as file:
            csv_writer = csv.DictWriter(file, fieldnames=columns)
            csv_writer.writeheader()
            csv_writer.writerows(tasks_list)
    except Exception as e:
        print(f"严重错误：无法更新待办任务文件 '{filepath}': {e}")


def load_existing_restaurant_urls(filepath):
    """Load every restaurant URL already saved to *filepath* into a set.

    URLs are normalized by stripping any ``#fragment`` so later duplicate
    checks are fragment-insensitive.  If the file does not exist yet it is
    created with the expected header and an empty set is returned.

    Fix: rows whose ``url`` field is missing, None (short row), or empty
    are now skipped instead of crashing on ``None.split`` or polluting
    the set with '' entries.
    """
    if not os.path.exists(filepath):
        with open(filepath, mode='w', encoding='utf-8', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=["country", "city", "url"])
            writer.writeheader()
        return set()

    with open(filepath, mode='r', encoding='utf-8', newline='') as file:
        if os.fstat(file.fileno()).st_size == 0:
            return set()
        reader = csv.DictReader(file)
        # row.get('url') guards against malformed/short rows where the
        # 'url' column parsed as None or ''.
        return {row['url'].split('#')[0] for row in reader if row and row.get('url')}


def extract_new_urls_on_page(driver, country, city, current_page_url, existing_urls_set):
    """Collect restaurant-detail URLs from the current listing page.

    Each href is resolved against *current_page_url*, its ``#fragment``
    stripped, and only URLs not yet present in *existing_urls_set* are
    kept.  The set is mutated in place, so duplicates within the same
    page are also filtered.

    Returns a tuple ``(rows_to_write, fresh_urls)``: dicts ready for the
    CSV writer, and a plain list of the same newly seen URLs.
    """
    rows_to_write = []
    fresh_urls = []
    try:
        anchors = driver.select_all("div.rfqBq a.BMQDV[href*='/Restaurant_Review-']")
        for anchor in anchors:
            href = anchor.get_attribute("href")
            if not href:
                continue
            normalized = urljoin(current_page_url, href).split('#')[0]
            if normalized in existing_urls_set:
                continue
            existing_urls_set.add(normalized)
            rows_to_write.append({"country": country, "city": city, "url": normalized})
            fresh_urls.append(normalized)
    except Exception as e:
        print(f"从 {current_page_url} 提取URL时出错: {e}")
    return rows_to_write, fresh_urls


def browse_random_restaurant_pages(driver, urls_to_browse):
    """Visit 0-2 randomly chosen detail pages to mimic human browsing.

    Does nothing when *urls_to_browse* is empty or the random draw is 0;
    otherwise samples at most two pages and dwells 10-15 seconds on each.
    """
    if not urls_to_browse:
        return

    how_many = random.randint(0, min(2, len(urls_to_browse)))
    if not how_many:
        return

    print(f"将随机浏览 {how_many} 个餐厅详情页...")
    for detail_url in random.sample(urls_to_browse, how_many):
        try:
            driver.get(detail_url)
            time.sleep(random.uniform(10, 15))
        except Exception as e:
            print(f"访问随机详情页 {detail_url} 时出错: {e}")


# --- Main scraper function ---

@browser(
    user_agent=UserAgent.REAL,
    chrome_executable_path=r"C:\Program Files\Google\Chrome\Application\chrome.exe",
    cache=True,
    block_images=True,
    lang="zh-CN",
    reuse_driver=True,
    close_on_crash=True,
    create_error_logs=True,
)
def scrape_restaurant_urls(driver, data, country=None):
    """Collect restaurant detail-page URLs for every pending city of a country.

    Driven by three CSV files derived from *country*:
      - ``<country>_city_urls.csv``        master list (city, url, status)
      - ``<country>_restaurant_urls.csv``  accumulated output (append-only)
      - ``<country>_pending_pages.csv``    per-city pagination progress cache

    Tasks are processed from the head of the queue: the head task is
    either advanced in place to the next listing page, or popped when its
    city has no further pages or a page load fails.  The progress cache
    is rewritten after every step so an interrupted run can resume.

    Returns the total number of distinct restaurant URLs known after the
    run (pre-existing ones included), or 0 when there was nothing to do.
    """
    # Accept the country either as a keyword argument or inside the
    # Botasaurus `data` dict (v4 calling convention used in __main__).
    if country is None and data is not None:
        country = data.get("country")
    if country is None:
        raise ValueError("必须提供 country 参数或在 data 中包含 country 键")
        
    city_urls_file = f"{country}_city_urls.csv"
    restaurant_urls_file = f"{country}_restaurant_urls.csv"
    pending_file = f"{country}_pending_pages.csv"

    pending_tasks = load_or_initialize_pending_tasks(pending_file, city_urls_file)
    print(f"[调试] load_or_initialize_pending_tasks 返回了 {len(pending_tasks)} 个任务。")
    if not pending_tasks:
        print("所有采集任务均已完成，或无法初始化任务。")
        return 0

    existing_urls = load_existing_restaurant_urls(restaurant_urls_file)
    print(f"已加载 {len(existing_urls)} 个已存在的餐厅URL用于去重。")

    # The output file stays open in append mode for the whole run; each
    # page's batch is flushed immediately so a crash loses at most one page.
    with open(restaurant_urls_file, mode='a', encoding='utf-8', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=["country", "city", "url"])
        task_index = 0
        while pending_tasks:
            task_index += 1
            # Always work on the head of the queue.
            current_task = pending_tasks[0]
            city = current_task["city"]
            current_page_url = current_task["next_page_url"]

            print(f"\n[调试] --- 开始处理第 {task_index} 个任务: 城市: {city}, 页面: {current_page_url} ---")

            try:
                driver.get(current_page_url)
                time.sleep(random.uniform(10, 20))  # human-like dwell time
            except Exception as e:
                # Load failure: drop the whole task (the city stays '0' in
                # the master file, so it will be retried on the next run).
                print(f"[调试] 无法加载页面 {current_page_url}: {e}。为安全起见，将此任务移除。")
                print(f"[调试] 移除任务前 pending_tasks 长度: {len(pending_tasks)}")
                pending_tasks.pop(0)
                print(f"[调试] 移除任务后 pending_tasks 长度: {len(pending_tasks)}")
                update_pending_file(pending_file, pending_tasks)
                continue
            new_restaurants, new_urls = extract_new_urls_on_page(driver, country, city, current_page_url, existing_urls)
            if new_restaurants:
                writer.writerows(new_restaurants)
                file.flush()  # persist immediately in case of a crash
                print(f"在当前页面新发现并保存了 {len(new_restaurants)} 个不重复的URL。")

            # Randomly open a few detail pages to look less bot-like.
            browse_random_restaurant_pages(driver, new_urls)

            # Return to the listing page before hunting for the next-page link.
            try:
                driver.get(current_page_url)
                time.sleep(random.uniform(5, 10))
            except Exception as e:
                print(f"[调试] 无法重新加载列表页 {current_page_url}: {e}。终止当前城市的采集。")
                print(f"[调试] 移除任务前 pending_tasks 长度: {len(pending_tasks)}")
                pending_tasks.pop(0)
                print(f"[调试] 移除任务后 pending_tasks 长度: {len(pending_tasks)}")
                update_pending_file(pending_file, pending_tasks)
                continue

            try:
                # aria-label='下一页' targets the "next page" pagination button.
                next_page_element = driver.select("div.IGLCo a.BrOJk[aria-label='下一页']", wait=5)
                if next_page_element and next_page_element.get_attribute("href"):
                    next_page_url = urljoin(current_page_url, next_page_element.get_attribute("href"))
                    # Advance the head task in place and checkpoint progress.
                    pending_tasks[0]["next_page_url"] = next_page_url
                    print(f"找到下一页，任务已更新为: {next_page_url}")
                    update_pending_file(pending_file, pending_tasks)
                else:
                    # No next-page link: the city is fully scraped — mark it
                    # done ('1') in the master file.
                    print(f"城市 '{city}' 已无下一页，采集完成。")
                    pending_tasks.pop(0)
                    update_pending_file(pending_file, pending_tasks)
                    update_master_status(city_urls_file, city, "1")
            except Exception:
                # select() raising (e.g. timeout) removes the task, but the
                # master status is deliberately NOT flipped to done — the
                # city remains '0' and will be re-checked on the next run.
                print(f"[调试] 在城市 '{city}' 未找到下一页按钮或查找时出错。")
                print(f"[调试] 移除任务前 pending_tasks 长度: {len(pending_tasks)}")
                pending_tasks.pop(0)
                print(f"[调试] 移除任务后 pending_tasks 长度: {len(pending_tasks)}")
                update_pending_file(pending_file, pending_tasks)

    # NOTE: this counts all known URLs, pre-existing ones included.
    return len(existing_urls)


# --- Run the scraper ---

def main():
    """Run the scraper for each configured country and report total time."""
    start_time = time.time()

    # Countries whose master files should be processed in this run.
    countries_to_scrape = ["乌兹别克斯坦"]

    for country in countries_to_scrape:
        print(f"\n=========================================")
        print(f"开始处理国家: {country}")
        print(f"=========================================")

        # The master file for this country must already exist.
        master_file = f"{country}_city_urls.csv"
        if not os.path.exists(master_file):
            print(f"错误: 找不到主控文件 '{master_file}'。请先准备好该文件。")
            continue  # skip this country and move on to the next

        # Botasaurus v4 calling convention: pass the country via `data`.
        scrape_restaurant_urls(data={"country": country})

    print("\n所有国家的任务都已处理完毕。")
    elapsed_time = time.time() - start_time
    print(f"总耗时: {elapsed_time:.6f} 秒 ({elapsed_time * 1000:.2f} 毫秒)。")


if __name__ == "__main__":
    main()